Library

packages

#install.packages("tidyverse")
library(tidyverse)
#BiocManager::install("infercnv")
library(infercnv)
#install.packages("Seurat")
library(Seurat)
#install.packages("ggplot2")
library(ggplot2)
#install.packages("kableExtra")
library(kableExtra)
#install.packages("data.table")
library(data.table)

Wilcoxon Test

plot-state: single normal sample from PC (BRCA1 vs TN)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
b17 <- read.table(file = "~/brca-infercnv/brca_output_dir_min_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))


#remove col names
b17 <- b17[2:length(rownames(b17)),]

#deletions
b17del <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
t17 <- read.table(file = "~/brca-infercnv/tn_output_dir_subset_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
t17 <- t17[2:length(rownames(t17)),]

#deletions
t17del <- t17 %>% 
  mutate(type = gsub("\\..*","", t17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#combine dfs
del <- rbind(b17del, t17del)

#generate plot
plot <- ggplot(del, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
    x = "BRCA1 Tumors (TN_B1) and TN tumors (TN)",
    y = "Sum of Deletions"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
plot

state: single wilcoxon test

#create separate b1 df
b1del <- b17del%>% mutate(genotype = `Sample Name`)
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
b1del <- b1del %>% filter(genotype %in% b1.names)
b1del$genotype <- str_sub(b1del$genotype, start = 1, end = 5)
b1del$genotype <- sub("TN_B1", "BRCA1 Tumor", b1del$genotype)

#create separate tn df
tndel <- t17del %>% mutate(genotype = `Sample Name`)
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")
tndel <- tndel %>% filter(genotype %in% tn.names)
tndel$genotype <- str_sub(tndel$genotype, start = 1, end = 2)
tndel$genotype <- sub("TN", "TN Tumor", tndel$genotype)

#make numeric
b1del$state <- as.numeric(b1del$`Summed States`)
tndel$state <- as.numeric(tndel$`Summed States`)

#xilcox test for tumors
w.tum.st <- wilcox.test(b1del$`Summed States`, tndel$state, alternative = "two.sided")
w.tum.st #W = 21200, p-value < 2.2e-16
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b1del$`Summed States` and tndel$state
## W = 21200, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

plot-length: single normal sample from PC (BRCA1 vs TN)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
b17 <- read.table(file = "~/brca-infercnv/tn_b1_epi_output_dir_subset_min/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
b17 <- b17[2:length(rownames(b17)),]

#deletions
b17del <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
t17 <- read.table(file = "~/brca-infercnv/tn_output_dir_subset_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
t17 <- t17[2:length(rownames(t17)),]

#deletions
t17del <- t17 %>% 
  mutate(type = gsub("\\..*","", t17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#combine dfs
del <- rbind(b17del, t17del)

#violin plot
plot <- ggplot(del, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
    x = "BRCA1 Tumors (TN_B1) and TN tumors (TN)",
    y = "Summed Length of Deletions"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
plot

length: single wilcoxon test

#create separate b1 df
b1del <- b17del%>% mutate(genotype = `Sample Name`)
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
b1del <- b1del %>% filter(genotype %in% b1.names)
b1del$genotype <- str_sub(b1del$genotype, start = 1, end = 5)
b1del$genotype <- sub("TN_B1", "BRCA1 Tumor", b1del$genotype)

#create separate tn df
tndel <- t17del %>% mutate(genotype = `Sample Name`)
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")
tndel <- tndel %>% filter(genotype %in% tn.names)
tndel$genotype <- str_sub(tndel$genotype, start = 1, end = 2)
tndel$genotype <- sub("TN", "TN Tumor", tndel$genotype)

#make numeric
b1del$`Summed Length` <- as.numeric(b1del$`Summed Length`)
tndel$`Summed Length` <- as.numeric(tndel$`Summed Length`)

#xilcox test for tumors
w.tum.ln <- wilcox.test(b1del$`Summed Length`, tndel$`Summed Length`, alternative = "two.sided")
w.tum.ln #W = 23113, p-value = 4.962e-14
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b1del$`Summed Length` and tndel$`Summed Length`
## W = 23113, p-value = 4.962e-14
## alternative hypothesis: true location shift is not equal to 0

plot-state: multiple samples from PC (B1 vs pre-neoplastic)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

b17 <- read.table(file = "~/brca-infercnv/brca_output_dir_pc_multi/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
b17 <- b17[2:length(rownames(b17)),]

#deletions
b17del <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#generate plot
bplot <- ggplot(b17del, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
        x = "Preneoplastic tissue (B1) and BRCA1 tumors (TN_B1)",
    y = "Sum of Deletions"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
bplot

state: multiple wilcoxon test

#seperate B1 and PN
b1del <- b17del %>% mutate(genotype = `Sample Name`)
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
b1del <- b1del %>% filter(genotype %in% b1.names)
b1del$genotype <- str_sub(b1del$genotype, start = 1, end = 5)
b1del$genotype <- sub("TN_B1", "BRCA1 Tumor", b1del$genotype)

pndel <- b17del %>% mutate(genotype = `Sample Name`)
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")
pndel <- pndel %>% filter(genotype %in% pn.names)
pndel$genotype <- str_sub(pndel$genotype, start = 1, end = 2)
pndel <- pndel %>% filter(genotype == "B1")
pndel$genotype <- sub("B1", "BRCA1 Preneoplastic", pndel$genotype)

#make numeric
b1del$`Summed States` <- as.numeric(b1del$`Summed States`)
pndel$`Summed States` <- as.numeric(pndel$`Summed States`)

#wilcox test for brca carriers
w.brca.st <- wilcox.test(b1del$`Summed States`, pndel$`Summed States`, alternative = "two.sided")
w.brca.st #W = 156334, p-value < 2.2e-16
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b1del$`Summed States` and pndel$`Summed States`
## W = 156334, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

plot-length: multiple samples from PC (B1 vs pre-neoplastic)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
b17 <- read.table(file = "~/brca-infercnv/brca_output_dir_pc_multi/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
b17 <- b17[2:length(rownames(b17)),]

#deletions
b17del <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#violin plot
bplot <- ggplot(b17del, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
        x = "Preneoplastic tissue (B1) and BRCA1 tumors (TN_B1)",
    y = "Summed Length of Deletions"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
bplot

length: multiple wilcoxon test

#seperate B1 and PN
#create separate b1 df
b1del <- b17del%>% mutate(genotype = `Sample Name`)
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
b1del <- b1del %>% filter(genotype %in% b1.names)
b1del$genotype <- str_sub(b1del$genotype, start = 1, end = 5)
b1del$genotype <- sub("TN_B1", "BRCA1 Tumor", b1del$genotype)

pndel <- b17del %>% mutate(genotype = `Sample Name`)
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")
pndel <- pndel %>% filter(genotype %in% pn.names)
pndel$genotype <- str_sub(pndel$genotype, start = 1, end = 2)
pndel <- pndel %>% filter(genotype == "B1")
pndel$genotype <- sub("B1", "BRCA1 Preneoplastic", pndel$genotype)

#make numeric
b1del$`Summed Length` <- as.numeric(b1del$`Summed Length`)
pndel$`Summed Length` <- as.numeric(pndel$`Summed Length`)

#wilcox test for brca carriers
w.brca.ln <- wilcox.test(b1del$`Summed Length`, pndel$`Summed Length`, alternative = "two.sided")
w.brca.ln #W = 157272, p-value < 2.2e-16
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b1del$`Summed Length` and pndel$`Summed Length`
## W = 157272, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

plot-state: multiple samples from PC (TN vs normal premenopausal)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
t17 <- read.table(file = "~/brca-infercnv/tp_subset_output_dir/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
t17 <- t17[2:length(rownames(t17)),]

t17del <- t17 %>% 
  mutate(type = gsub("\\..*","", t17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#generate plot
tplot <- ggplot(t17del, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
        x = "Premenopausal tissue (N) and TN Tumors (TN)",
    y = "Sum of Deletions"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
tplot

state: multiple wilcoxon test

#create separate tn df
tndel <- t17del %>% mutate(genotype = `Sample Name`)
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")
tndel <- tndel %>% filter(genotype %in% tn.names)
tndel$genotype <- str_sub(tndel$genotype, start = 1, end = 2)
tndel$genotype <- sub("TN", "TN Tumor", tndel$genotype)

tpdel <- t17del %>% mutate(genotype = `Sample Name`)
tp.names <- c("N_0019",  "N_0233",  "N_0092", "N_0093",  "N_0123",  "N_0064",  "N_0169")
tpdel <- tpdel %>% filter(genotype %in% tp.names)
tpdel$genotype <- str_sub(tpdel$genotype, start = 1, end = 1)
tpdel$genotype <- sub("N", "Human Premenopausal", tpdel$genotype)

#make numeric
tndel$`Summed States` <- as.numeric(tndel$`Summed States`)
tpdel$`Summed States` <- as.numeric(tpdel$`Summed States`)

#wilcox test for brca carriers
w.tn.st <- wilcox.test(tndel$`Summed States`, tpdel$`Summed States`, alternative = "two.sided")
w.tn.st #W = 1126640457, p-value = 0.6612
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  tndel$`Summed States` and tpdel$`Summed States`
## W = 14274, p-value = 4.961e-06
## alternative hypothesis: true location shift is not equal to 0

plot-length: multiple samples from PC (TN vs normal premenopausal)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
t17 <- read.table(file = "~/brca-infercnv/tp_subset_output_dir/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))


#remove col names
t17 <- t17[2:length(rownames(t17)),]

#deletions
t17del <- t17 %>% 
  mutate(type = gsub("\\..*","", t17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#violin plot
tplot <- ggplot(t17del, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
        x = "Premenopausal tissue (N) and TN Tumors (TN)",
    y = "Summed Length of Deletions"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
tplot

length: multiple wilcoxon test

#tn df
tndel <- t17del %>% mutate(genotype = `Sample Name`)
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")
tndel <- tndel %>% filter(genotype %in% tn.names)
tndel$genotype <- str_sub(tndel$genotype, start = 1, end = 2)
tndel$genotype <- sub("TN", "TN Tumor", tndel$genotype)

#tp df
tpdel <- t17del %>% mutate(genotype = `Sample Name`)
tp.names <- c("N_0019",  "N_0233",  "N_0092", "N_0093",  "N_0123",  "N_0064",  "N_0169")
tpdel <- tpdel %>% filter(genotype %in% tp.names)
tpdel$genotype <- str_sub(tpdel$genotype, start = 1, end = 1)
tpdel$genotype <- sub("N", "Human Premenopausal", tpdel$genotype)

#make numeric
tndel$`Summed Length` <- as.numeric(tndel$`Summed Length`)
tpdel$`Summed Length` <- as.numeric(tpdel$`Summed Length`)

#wilcox test for brca carriers
w.tn.ln <- wilcox.test(tndel$`Summed Length`, tpdel$`Summed Length`, alternative = "two.sided")
w.tn.ln #W = 1239947246, p-value < 2.2e-16
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  tndel$`Summed Length` and tpdel$`Summed Length`
## W = 14887, p-value = 9.738e-08
## alternative hypothesis: true location shift is not equal to 0

plot-state: single normal sample from PC (Pre-neo vs Human Premeno)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#load data
pn17 <- read.table(file = "~/brca-infercnv/pn_output_dir_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
pn17 <- pn17[2:length(rownames(pn17)),]

#deletions
pn17del <- pn17 %>% 
  mutate(type = gsub("\\..*","", pn17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  distinct(cell_group_name, .keep_all = TRUE)

#load data
tp17 <- read.table(file = "~/brca-infercnv/tp_output_dir_05_13/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
tp17 <- tp17[2:length(rownames(tp17)),]

tp.names <- c("N_0019",  "N_0233",  "N_0092", "N_0093",  "N_0123",
              "N_0064",  "N_0169")

#deletions
tp17del <- tp17 %>% 
  mutate(type = gsub("\\..*","", tp17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  filter(`Sample Name` %in% tp.names) %>%
  distinct(cell_group_name, .keep_all = TRUE)

#combine dfs
norm17del <- rbind(pn17del, tp17del)

#generate plot
normplot <- ggplot(norm17del, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
    x = "Preneoplastic Tissue (B1) and Premenopausal Tissue (N)",
    y = "Sum of Deletions"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
normplot

state: single wilcoxon test

#create separate preneo df
pndel <- pn17del %>% mutate(genotype = `Sample Name`)
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")
pndel <- pndel %>% filter(genotype %in% pn.names)
pndel$genotype <- str_sub(pndel$genotype, start = 1, end = 2)
pndel <- pndel %>% filter(genotype == "B1")
pndel$genotype <- sub("B1", "BRCA1 Preneoplastic", pndel$genotype)

tpdel <- tp17del %>% mutate(genotype = `Sample Name`)
tp.names <- c("N_0019",  "N_0233",  "N_0092", "N_0093",  "N_0123",
              "N_0064",  "N_0169")
tpdel <- tpdel %>% filter(genotype %in% tp.names)
tpdel$genotype <- str_sub(tpdel$genotype, start = 1, end = 1)
tpdel$genotype <- sub("N", "Human Premenopausal", tpdel$genotype)

#make numeric
pndel$`Summed States` <- as.numeric(pndel$`Summed States`)
tpdel$`Summed States` <- as.numeric(tpdel$`Summed States`)

#xilcox test for tumors
w.norm.st <- wilcox.test(pndel$`Summed States`, tpdel$`Summed States`, alternative = "two.sided")
w.norm.st #W = 96990, p-value = 0.4548
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  pndel$`Summed States` and tpdel$`Summed States`
## W = 103300, p-value = 0.741
## alternative hypothesis: true location shift is not equal to 0

plot-length: single normal sample from PC (Preneo vs Human Premeno)

#extract data from brca1 preneo
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#load data
pn17 <- read.table(file = "~/brca-infercnv/pn_output_dir_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                   col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
pn17 <- pn17[2:length(rownames(pn17)),]

#deletions
pn17del <- pn17 %>% 
  mutate(type = gsub("\\..*","", pn17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#extract data from tp meno
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#load data
tp17 <- read.table(file = "~/brca-infercnv/tp_output_dir_05_13/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
tp17 <- tp17[2:length(rownames(tp17)),]

#deletions
tp17del <- tp17 %>% 
  mutate(type = gsub("\\..*","", tp17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE) %>%
  filter(`Sample Name` != "N_1105_epi")

#combine dfs
norm17del <- rbind(pn17del, tp17del)

#violin plot
normplot <- ggplot(norm17del, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
    x = "Preneoplastic Tissue (B1) and Premenopausal Tissue (N)",
    y = "Summed Length of Deletions"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
normplot

length: single wilcoxon test

#create separate preneo df
pndel <- pn17del %>% mutate(genotype = `Sample Name`)
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")
pndel <- pndel %>% filter(genotype %in% pn.names)
pndel$genotype <- str_sub(pndel$genotype, start = 1, end = 2)
pndel <- pndel %>% filter(genotype == "B1")
pndel$genotype <- sub("B1", "BRCA1 Preneoplastic", pndel$genotype)

tpdel <- tp17del %>% mutate(genotype = `Sample Name`)
tp.names <- c("N_0019",  "N_0233",  "N_0092", "N_0093",  "N_0123",  "N_0064",  "N_0169")
tpdel <- tpdel %>% filter(genotype %in% tp.names)
tpdel$genotype <- str_sub(tpdel$genotype, start = 1, end = 1)
tpdel$genotype <- sub("N", "Human Premenopausal", tpdel$genotype)

#make numeric
pndel$`Summed Length` <- as.numeric(pndel$`Summed Length`)
tpdel$`Summed Length` <- as.numeric(tpdel$`Summed Length`)

#xilcox test for tumors
w.norm.ln <- wilcox.test(pndel$`Summed Length`, tpdel$`Summed Length`, alternative = "two.sided")
w.norm.ln #W = 101848, p-value = 0.9706
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  pndel$`Summed Length` and tpdel$`Summed Length`
## W = 101848, p-value = 0.9706
## alternative hypothesis: true location shift is not equal to 0

Wilcoxon results table

table of wilcoxon results

#combine each result
brca.st <- list(comparison = "BRCA1 Tumor vs Preneoplastic",
                statistic = w.brca.st$statistic,
                p.value = w.brca.st$p.value,
                event = "deletion",
                value.type = "state")
brca.st <- as.data.frame(brca.st, row.names = NULL)

tum.st <- list(comparison = "BRCA1 Tumor vs TN tumor",
                statistic = w.tum.st$statistic,
                p.value = w.tum.st$p.value,
                event = "deletion",
                value.type = "state")
tum.st <- as.data.frame(tum.st, row.names = NULL)

brca.ln <- list(comparison = "BRCA1 Tumor vs Preneoplastic",
                statistic = w.brca.ln$statistic,
                p.value = w.brca.ln$p.value,
                event = "deletion",
                value.type = "length")
brca.ln <- as.data.frame(brca.ln, row.names = NULL)

tum.ln <- list(comparison = "BRCA1 Tumor vs TN tumor",
                statistic = w.tum.ln$statistic,
                p.value = w.tum.ln$p.value,
                event = "deletion",
                value.type = "length")
tum.ln <- as.data.frame(tum.ln, row.names = NULL)

tn.st <- list(comparison = "TN Tumor vs Premenopausal",
                statistic = w.tn.st$statistic,
                p.value = w.tn.st$p.value,
                event = "deletion",
                value.type = "state")
tn.st <- as.data.frame(tn.st, row.names = NULL)

tn.ln <- list(comparison = "TN Tumor vs Premenopausal",
                statistic = w.tn.ln$statistic,
                p.value = w.tn.ln$p.value,
                event = "deletion",
                value.type = "length")
tn.ln <- as.data.frame(tn.ln, row.names = NULL)

norm.st <- list(comparison = "Preneoplastic vs Premenopausal",
                statistic = w.norm.st$statistic,
                p.value = w.norm.st$p.value,
                event = "deletion",
                value.type = "state")
norm.st <- as.data.frame(norm.st, row.names = NULL)

norm.ln <- list(comparison = "Preneoplastic vs Premenopausal",
                statistic = w.norm.ln$statistic,
                p.value = w.norm.ln$p.value,
                event = "deletion",
                value.type = "length")
norm.ln <- as.data.frame(norm.ln, row.names = NULL)

#set as datatable

#group results
wlx.df <- rbind(
  as.data.frame(brca.st, row.names = NULL),
  as.data.frame(brca.ln, row.names = NULL),
  as.data.frame(tum.st, row.names = NULL),              
  as.data.frame(tum.ln, row.names = NULL),
  as.data.frame(tn.st, row.names = NULL),
  as.data.frame(tn.ln, row.names = NULL),
  as.data.frame(norm.st, row.names = NULL),
  as.data.frame(norm.ln, row.names = NULL)
              )

#create table
wlx.df %>%
  kbl(caption = "Wilcoxon Rank Sum Test Results", digits = 10) %>%
  kable_classic(html_font = "Cambria") %>%
  kable_styling() 
Wilcoxon Rank Sum Test Results
comparison statistic p.value event value.type
BRCA1 Tumor vs Preneoplastic 156334.5 0.0000000000 deletion state
BRCA1 Tumor vs Preneoplastic 157272.0 0.0000000000 deletion length
BRCA1 Tumor vs TN tumor 21199.5 0.0000000000 deletion state
BRCA1 Tumor vs TN tumor 23113.0 0.0000000000 deletion length
TN Tumor vs Premenopausal 14274.5 0.0000049611 deletion state
TN Tumor vs Premenopausal 14887.0 0.0000000974 deletion length
Preneoplastic vs Premenopausal 103299.5 0.7409962526 deletion state
Preneoplastic vs Premenopausal 101848.0 0.9706026339 deletion length

Median fold change code (BRCA1 Tumor vs TN Tumor)

plot-state: single normal sample from PC (BRCA1 vs TN)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
b17 <- read.table(file = "~/brca-infercnv/tn_b1_epi_output_dir_subset_min/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
b17 <- b17[2:length(rownames(b17)),]

#deletions
b17del <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#generate plot
bplot <- ggplot(b17del, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
    x = "BRCA1 Tumors (TN_B1) and TN tumors (TN)",
    y = "Sum of Deletions"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
bplot

state: median fold change (use data from multi-sample brca1 run)

#seperate B1 and TN
b1del <- b17del %>% mutate(genotype = cell_group_name)
b1del$genotype <- str_sub(b1del$genotype, start = 1, end = 5)
b1del <- b1del %>% filter(genotype == "TN_B1") %>%
  mutate(genotype = "BRCA1 Tumor")

#seperate B1 and TN
tndel <- b17del %>% mutate(genotype = cell_group_name)
tndel$genotype <- str_sub(tndel$genotype, start = 1, end = 5)
tndel <- tndel %>% filter(genotype != "TN_B1")
tndel$genotype <- str_sub(tndel$genotype, start = 1, end = 2)
tndel <- tndel %>% filter(genotype == "TN") %>%
  mutate(genotype = "TN Tumor")

#make numeric
b1del$`Summed States` <- as.numeric(b1del$state)
tndel$`Summed States` <- as.numeric(tndel$state)

#new df
del <- rbind(b1del, tndel)
del <- del %>% rename(sample = `Sample Name`)

# Aggregate sizes based on event type and Genotype for deletions in MB
mdn.smpl <- del %>%  
  group_by(sample, genotype) %>%
  summarise(mb.del = sum(state))

# Aggregate sizes based on event type and Genotype for deletions in MB
mdn.gntp <- del %>%  
  group_by(genotype) %>%
  summarise(mb.del = sum(state))

# Calculate the median fold change for deletion events in genotypes
st.mdn.smpl <- mdn.smpl %>%
  group_by(sample, genotype) %>%
  summarise(`median value (del)` = median(mb.del)) %>%
  ungroup() %>%
  mutate(`median fold change (del)` = `median value (del)` / `median value (del)`[genotype == "TN Tumor"])

#label as state
st.mdn.smpl <- st.mdn.smpl %>% mutate(type = "state")


#median fold change for genotypes
st.mdn.gntp <- mdn.gntp %>%
  group_by(genotype) %>%
  summarise(`median value (del)` = median(mb.del)) %>%
  ungroup() %>%
  mutate(`median fold change (del)` = `median value (del)` / `median value (del)`[genotype == "TN Tumor"])

#label as state
st.mdn.gntp <- st.mdn.gntp %>% mutate(type = "state")

plot-length: single normal sample from PC (BRCA1 vs TN)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
b17 <- read.table(file = "~/brca-infercnv/tn_b1_epi_output_dir_subset_min/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
b17 <- b17[2:length(rownames(b17)),]

#deletions
b17del <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#violin plot
bplot <- ggplot(b17del, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
    x = "BRCA1 Tumors (TN_B1) and TN tumors (TN)",
    y = "Summed Length of Deletions"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
bplot

length: median fold change (use data from multi-sample brca1 run)

#seperate B1 and PN
b1del <- b17del %>% mutate(genotype = cell_group_name)
b1del$genotype <- str_sub(b1del$genotype, start = 1, end = 5)
b1del <- b1del %>% filter(genotype == "TN_B1") %>%
  mutate(genotype = "BRCA1 Tumor")

#seperate B1 and TN
tndel <- b17del %>% mutate(genotype = cell_group_name)
tndel$genotype <- str_sub(tndel$genotype, start = 1, end = 5)
tndel <- tndel %>% filter(genotype != "TN_B1")
tndel$genotype <- str_sub(tndel$genotype, start = 1, end = 2)
tndel <- tndel %>% filter(genotype == "TN") %>%
  mutate(genotype = "TN Tumor")


#make numeric
b1del$`Summed Length` <- as.numeric(b1del$Length)
tndel$`Summed Length` <- as.numeric(tndel$Length)

#new df
del <- rbind(b1del, tndel)
del <- del %>% rename(sample = `Sample Name`)

# Aggregate sizes based on event type and Genotype for deletions in MB
mdn.smpl <- del %>%  
  group_by(sample, genotype) %>%
  summarise(mb.del = sum(Length))

# Aggregate sizes based on event type and Genotype for deletions in MB
mdn.gntp <- del %>%  
  group_by(genotype) %>%
  summarise(mb.del = sum(Length))

# Calculate the median fold change for deletion events in genotypes
ln.mdn.smpl <- mdn.smpl %>%
  group_by(sample, genotype) %>%
  summarise(`median value (del)` = median(mb.del)) %>%
  ungroup() %>%
  mutate(`median fold change (del)` = `median value (del)` / `median value (del)`[genotype == "TN Tumor"])

#label as length
ln.mdn.smpl <- ln.mdn.smpl %>% mutate(type = "length")

#median fold change for genotypes
ln.mdn.gntp <- mdn.gntp %>%
  group_by(genotype) %>%
  summarise(`median value (del)` = median(mb.del)) %>%
  ungroup() %>%
  mutate(`median fold change (del)` = `median value (del)` / `median value (del)`[genotype == "TN Tumor"])

#label as length
ln.mdn.gntp <- ln.mdn.gntp %>% mutate(type = "length")

Median fold change results table(BRCA1 Tumor vs TN Tumor)

sample: table of median fold change (BRCA1 vs TN Tumor)

#combine dfs
mdn.smpl <- rbind(st.mdn.smpl, ln.mdn.smpl)

#create table
mdn.smpl %>%
  kbl(caption = "Median Fold Change (deletion)", digits = 10) %>%
  kable_classic(html_font = "Cambria") %>%
  kable_styling(latex_options = "striped")
Median Fold Change (deletion)
sample genotype median value (del) median fold change (del) type
TN_0106 TN Tumor 14.000000 1.000000 state
TN_0114 TN Tumor 36.000000 1.000000 state
TN_0126 TN Tumor 58.000000 1.000000 state
TN_0135 TN Tumor 23.000000 1.000000 state
TN_B1_0131 BRCA1 Tumor 247.000000 17.642857 state
TN_B1_0177 BRCA1 Tumor 882.000000 24.500000 state
TN_B1_0554 BRCA1 Tumor 228.000000 3.931034 state
TN_B1_4031 BRCA1 Tumor 179.000000 7.782609 state
TN_0106 TN Tumor 0.542268 1.000000 length
TN_0114 TN Tumor 0.755306 1.000000 length
TN_0126 TN Tumor 0.799910 1.000000 length
TN_0135 TN Tumor 0.144390 1.000000 length
TN_B1_0131 BRCA1 Tumor 10.694378 19.721573 length
TN_B1_0177 BRCA1 Tumor 10.583952 14.012800 length
TN_B1_0554 BRCA1 Tumor 3.500788 4.376477 length
TN_B1_4031 BRCA1 Tumor 0.997269 6.906773 length

genotype: table of median fold change (BRCA1 vs TN Tumor)

#combine dfs
mdn.gntp <- rbind(st.mdn.gntp, ln.mdn.gntp)

#create table
mdn.gntp %>%
  kbl(caption = "Median Fold Change (deletion)", digits = 10) %>%
  kable_classic(html_font = "Cambria") %>%
  kable_styling(latex_options = "striped")
Median Fold Change (deletion)
genotype median value (del) median fold change (del) type
BRCA1 Tumor 1536.000000 11.72519 state
TN Tumor 131.000000 1.00000 state
BRCA1 Tumor 25.776387 11.49770 length
TN Tumor 2.241874 1.00000 length

Median fold change (BRCA1 Preneoplastic vs Human Premenopausal)

plot-state: single normal sample from PC (Pre-neo vs Human Premeno)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#load data
pn17 <- read.table(file = "~/brca-infercnv/pn_output_dir_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
pn17 <- pn17[2:length(rownames(pn17)),]

#deletions
pn17del <- pn17 %>% 
  mutate(type = gsub("\\..*","", pn17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  distinct(cell_group_name, .keep_all = TRUE)

#load data
tp17 <- read.table(file = "~/brca-infercnv/norm_output_dir_min/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
tp17 <- tp17[2:length(rownames(tp17)),]

#deletions
tp17del <- tp17 %>% 
  mutate(type = gsub("\\..*","", tp17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  filter(`Sample Name` != "N_1105")

#combine dfs
norm17del <- rbind(pn17del, tp17del)

#generate plot
normplot <- ggplot(norm17del, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
    x = "Preneoplastic Tissue (B1) and Premenopausal Tissue (N)",
    y = "Sum of Deletions"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
normplot

state: median fold change (use data from multi-sample tn run)

#seperate B1 and PN
pndel <- norm17del %>% mutate(genotype = cell_group_name)
pndel$genotype <- str_sub(pndel$genotype, start = 1, end = 2)
pndel <- pndel %>% filter(genotype == "B1") %>%
  mutate(genotype = "BRCA1 preneoplastic")

tpdel <- norm17del %>% mutate(genotype = cell_group_name)
tpdel <- tpdel %>% filter(`Sample Name` != "N_1105_epi")
tpdel$genotype <- str_sub(tpdel$genotype, start = 1, end = 1)
tpdel <- tpdel %>% filter(genotype == "N") %>%
  mutate(genotype = "premenopausal")

#make numeric
pndel$`Summed States` <- as.numeric(pndel$state)
tpdel$`Summed States` <- as.numeric(tpdel$state)

#new df
del <- rbind(pndel, tpdel)
del <- del %>% rename(sample = `Sample Name`)

# Aggregate sizes based on event type and Genotype for deletions in MB
mdn.smpl <- del %>%  
  group_by(sample, genotype) %>%
  summarise(mb.del = sum(state))

# Aggregate sizes based on event type and Genotype for deletions in MB
mdn.gntp <- del %>%  
  group_by(genotype) %>%
  summarise(mb.del = sum(state))

# Calculate the median fold change for deletion events in genotypes
st.mdn.smpl <- mdn.smpl %>%
  group_by(sample, genotype) %>%
  summarise(`median value (del)` = median(mb.del)) %>%
  ungroup() %>%
  mutate(`median fold change (del)` = `median value (del)` / `median value (del)`[genotype == "premenopausal"])

#label as state
st.mdn.smpl <- st.mdn.smpl %>% mutate(type = "state")

#median fold change for genotypes
st.mdn.gntp <- mdn.gntp %>%
  group_by(genotype) %>%
  summarise(`median value (del)` = median(mb.del)) %>%
  ungroup() %>%
  mutate(`median fold change (del)` = `median value (del)` / `median value (del)`[genotype == "premenopausal"])

#label as state
st.mdn.gntp <- st.mdn.gntp %>% mutate(type = "state")

plot-length: single normal sample from PC (Preneo vs Human Premeno)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#load data
pn17 <- read.table(file = "~/brca-infercnv/pn_output_dir_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                   col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
pn17 <- pn17[2:length(rownames(pn17)),]

#deletions
pn17del <- pn17 %>% 
  mutate(type = gsub("\\..*","", pn17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE) 

#load data
tp17 <- read.table(file = "~/brca-infercnv/norm_output_dir_min/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                   col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
tp17 <- tp17[2:length(rownames(tp17)),]

#deletions
tp17del <- tp17 %>% 
  mutate(type = gsub("\\..*","", tp17$cell_group_name)) %>% 
  filter(state < 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE) %>%
  filter(`Sample Name` != "N_1105")

#combine dfs
norm17del <- rbind(pn17del, tp17del)

#violin plot
normplot <- ggplot(norm17del, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Deletions",
    x = "Preneoplastic Tissue (B1) and Premenopausal Tissue (N)",
    y = "Summed Length of Deletions"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
normplot

length: median fold change (use data from multi-sample tn run)

#seperate TP and PN
pndel <- norm17del %>% mutate(genotype = cell_group_name)
pndel$genotype <- str_sub(pndel$genotype, start = 1, end = 2)
pndel <- pndel %>% filter(genotype == "B1") %>%
  mutate(genotype = "BRCA1 preneoplastic")

#seperate TP and PN
tpdel <- norm17del %>% mutate(genotype = cell_group_name)
tpdel <- tpdel %>% filter(`Sample Name` != "N_1105")
tpdel$genotype <- str_sub(tpdel$genotype, start = 1, end = 1)
tpdel <- tpdel %>% filter(genotype == "N") %>%
  mutate(genotype = "premenopausal")

#make numeric
pndel$`Summed Length` <- as.numeric(pndel$Length)
tpdel$`Summed Length` <- as.numeric(tpdel$Length)

#new df
del <- rbind(pndel, tpdel)
del <- del %>% rename(sample = `Sample Name`)

# Aggregate sizes based on event type and Genotype for deletions in MB
mdn.smpl <- del %>%  
  group_by(sample, genotype) %>%
  summarise(mb.del = sum(Length))

# Aggregate sizes based on event type and Genotype for deletions in MB
mdn.gntp <- del %>%  
  group_by(genotype) %>%
  summarise(mb.del = sum(Length))

# Calculate the median fold change for deletion events in genotypes
ln.mdn.smpl <- mdn.smpl %>%
  group_by(sample, genotype) %>%
  summarise(`median value (del)` = median(mb.del)) %>%
  ungroup() %>%
  mutate(`median fold change (del)` = `median value (del)` / `median value (del)`[genotype == "premenopausal"])

#label as state
ln.mdn.smpl <- ln.mdn.smpl %>% mutate(type = "length")

#median fold change for genotypes
ln.mdn.gntp <- mdn.gntp %>%
  group_by(genotype) %>%
  summarise(`median value (del)` = median(mb.del)) %>%
  ungroup() %>%
  mutate(`median fold change (del)` = `median value (del)` / `median value (del)`[genotype == "premenopausal"])

#label as state
ln.mdn.gntp <- ln.mdn.gntp %>% mutate(type = "length")

Median fold change results table (BRCA1 Preneoplastic vs Human Premenopausal)

sample: table of median fold change (Pre-neo vs Premenopausal)

#combine dfs
mdn.smpl <- rbind(st.mdn.smpl, ln.mdn.smpl)

#create table
mdn.smpl %>%
  kbl(caption = "Median Fold Change (deletion)", digits = 10) %>%
  kable_classic(html_font = "Cambria") %>%
  kable_styling(latex_options = "striped")
Median Fold Change (deletion)
sample genotype median value (del) median fold change (del) type
B1_0023 BRCA1 preneoplastic 282.000000 1.2260870 state
B1_0033 BRCA1 preneoplastic 412.000000 15.8461538 state
B1_0090 BRCA1 preneoplastic 344.000000 3.2452830 state
B1_0894 BRCA1 preneoplastic 493.000000 2.9345238 state
N_0019 premenopausal 230.000000 0.9663866 state
N_0064 premenopausal 26.000000 0.2000000 state
N_0092 premenopausal 106.000000 1.1276596 state
N_0093 premenopausal 168.000000 1.1351351 state
N_0123 premenopausal 238.000000 1.0347826 state
N_0169 premenopausal 130.000000 5.0000000 state
N_0230 premenopausal 94.000000 0.8867925 state
N_0233 premenopausal 148.000000 0.8809524 state
B1_0023 BRCA1 preneoplastic 2.600092 0.6655649 length
B1_0033 BRCA1 preneoplastic 7.070205 31.9199496 length
B1_0090 BRCA1 preneoplastic 4.776963 2.6801351 length
B1_0894 BRCA1 preneoplastic 4.210179 1.4753565 length
N_0019 premenopausal 3.906594 1.8818688 length
N_0064 premenopausal 0.221498 0.1137325 length
N_0092 premenopausal 1.782359 0.8670352 length
N_0093 premenopausal 2.853669 1.0230753 length
N_0123 premenopausal 2.075912 2.1526898 length
N_0169 premenopausal 1.947534 0.4985248 length
N_0230 premenopausal 2.055694 9.2808694 length
N_0233 premenopausal 2.789305 1.5649513 length
N_1105_epi premenopausal 0.964334 0.3379278 length

genotype: table of median fold change (Pre-neo vs Premenopausal)

#combine dfs
mdn.gntp <- rbind(st.mdn.gntp, ln.mdn.gntp)

#create table
mdn.gntp %>%
  kbl(caption = "Median Fold Change (deletion)", digits = 10) %>%
  kable_classic(html_font = "Cambria") %>%
  kable_styling(latex_options = "striped")
Median Fold Change (deletion)
genotype median value (del) median fold change (del) type
BRCA1 preneoplastic 1531.00000 1.342982 state
premenopausal 1140.00000 1.000000 state
BRCA1 preneoplastic 18.65744 1.003255 length
premenopausal 18.59690 1.000000 length